#####
# Crossing borders: update time-invariant data to 2019-vintage municipalities
#####

library(haven)
library(here)
library(data.table)

# There are four datasets to read in and update:
# * Indicators for inclusion in the border region
# * Measures of travel time to the border
# * Information on time-constant municipality features (many based on 2000 census)

# We need two data.tables from muniPopCleaning.R to make this work:
# * cw: the c-ross w-walk that updates historic municipality vintages to contemporary vintages. Covers 1960-2019.
# * muniPop_t: total populations by year for historic municipality vintages. From 1991-2005 it has 2005 vintage muni; afterwards it has the contemporary vintages. Goes to 2018.

###
# 1. Add the 2019 bfs numbers to the border region data
br <- read_dta(here("data",  "cleaning", "auxiliary_files", "border_region_identifiers_bigotta.dta")); setDT(br) #Provided by AB; includes BfS numbers up to 2011-12-31.

# br includes 9 bfs numbers that are unpopulated forests and other "shared areas". Drop these.
forests <- c(2285, 2391, 5020, 5391, 5392, 5393, 5394, 6072, 6391)
br <- br[!bfsnr %in% forests]; rm(forests)

# includes the bfs codes for 8 municipalities that left Bern to form Jura in 1978. Since this is long before our study, just drop these.
bern_to_jura <- c(484:489, 528, 529)
br <- br[!bfsnr %in% bern_to_jura]; rm(bern_to_jura)

br[unique(cw[,.(bfs,bfs19)]), on = c(bfsnr="bfs"), bfs19 := bfs19] 
#saveRDS(br, "../data/br.rds")
#Note: this adds three rows to the data for 4555, 4915, and 4511: three muni in Thurgau that for various reasons spread across multiple 2019-vintage municipalities. In this case, it is not an issue: they are all in Thurgau which is entirely in the border region.

# now we create the 2019 vintage version of the border region identifiers.
# There is an issue: four 2019-vintage municipalities (3668, 5749. 5634, 5938) are composed of municipalities that were and were not part of the border region. 
# In all four cases the municipality that orginally has that number was part of the border region. So just code these all as part of the border region.
br19 <- br[ , .(BR = max(BR)), by = bfs19]

###
# 2. update the travel time data.

tt <- read_dta(here("data", "cleaning", "auxiliary_files", "traveltime_originMUN2012_bordercrossingsMUN.dta")); setDT(tt) # this uses the 2012-01-01 BfS numbers

# there are 11 muni in this data that disappeared between 2012-01-01 and 2012-12-31. 
# Start by adding in the 2012-01-01 pop data for ALL EXCEPT THESE, then do those using the 2011-12-31 measures.
# (add the bfs19 numbers for both at the same time)
tt[muniPop_t[year==2012], on = c(munnr2012="bfs"), `:=`(pop = popJan, bfs19 = bfs19)]
tt[muniPop_t[year==2011, .(bfs, bfs19, popDec, pop=NA)], on = c(munnr2012="bfs", pop="pop"), `:=`(pop = popDec, bfs19 = i.bfs19)]

# now create weighted averages of all the distance measures
dist_vars <- c("distmin", "distkm", "travelMUNmin", "travelMUNkm", "travelEXACTmin", "travelEXACTkm")

tt[ , weights_2019 := pop/sum(pop), keyby = bfs19]

fwrite(tt[, .(munnr2012, bfs19, weights_2019)], here("data", "cw_munnr12_bfs19.csv"))

tt19 <- tt[ , lapply(.SD, weighted.mean, w = weights_2019), by = bfs19, .SDcols = dist_vars]; rm(dist_vars)

###
# 3. time-constant municipality information and 2000 census measures.

id <- fread(here("data", "cleaning", "MuniCodes.csv"), encoding = "UTF-8") # based on steinhauser_history2012_bfsnr2012_longfile. Has historic bfs and the 2012-01-01 bfs numbers.

# Incudes bfsnr 4875 & 4960 which do not exist on the bfs website. 
# These appear to be ortsgemeinde from Thurgau that were absorbed during the 1990s mutation spree. Drop these.
lost_muni <- c(4875, 4960)
id <- id[!bfsnr %in% lost_muni]; rm(lost_muni)

# The three double entery Thurgau municipalities are problems here as well. The id data has them as follows (bfs -> bfs2012): 4511->4511, 4915->4911, 4555->4551.
# Solution: adjust the cw to only include those cases
doubles_logic <- "!((bfs==4511 & bfs19 != 4511) | (bfs==4915 & bfs19 != 4911) | (bfs==4555 & bfs19 != 4551))"
id[unique(cw[eval(parse(text=doubles_logic)),.(bfs,bfs19)]), on = c(bfsnr="bfs"), bfs19 := bfs19]; rm(doubles_logic)

# We only need a few variables from id: canton_name, canton_nr, languagereg2000.
# Canton names and numbers are constant for each bfs19 number. Good. 
# language region is not. Solution: use the modal language among constituent municipalities.

id19 <- id[ , .(canton_name = unique(canton_name), canton_nr = unique(canton_nr), languagereg2019 = Mode(languagereg2000)), by = bfs19]

###
# 4. Merge all of these 2019-vintange time-constant measures together
tc <- id19[br19[tt19, , on = .NATURAL], , on = .NATURAL]

# remove the old 2019 version data
rm(id19, br19,tt19)

###
# 5. Create the variables necessary for our analysis

set(tc, j = "border0", value = as.integer(tc$distkm == 0))
set(tc, j = "border5", value = as.integer(tc$travelMUNmin < 5))
set(tc, j = "border15", value = as.integer(tc$travelMUNmin < 15))
set(tc, j = "border30", value = as.integer(tc$travelMUNmin %between% c(15,30)))


message(
  "You've created the following data.tables:
  * tc: This has the 2019-vintage version of all our time constant variables. Includes the treatment identifiers based on travelMUNkm.
  * id: This is the 2012-01-01-vintage data with municipality, canton, district, and region names+numbers and some characteristics measured during the 2000 census. Includes the link to the 2019-vintage muni.
  * tt: This is the 2012-01-01-vintage data with information on the travel time to municipalities. Includes the link to the 2019-vintage municipalities.
  * br: This includes all municipalities up to 2012-01-01 (unclear start date) and has an indicator for membership in the border region plus the 2019-vintange municipality indicator."
  )